import json
import sys
from openai import OpenAI

# ============ Configuration ============
# Schema as consumed by evaluate(): each reference line must carry the keys
# "instruction" and "Manipulate"; each generated line must carry "Manipulate".
# NOTE(review): earlier notes described the code field as "code" — confirm
# which key the data files actually use.
REFERENCE_FILE = "wolfram_test_gold.jsonl"   # reference (ground-truth) data, one JSON object per line
GENERATED_FILE = "wolfram_generated.jsonl"   # generated data, line order aligned with the reference file
OUTPUT_FILE = "eval_results_wolfram.jsonl"   # evaluation output, one JSON object per line

BASE_URL = "http://35.220.164.252:3888/v1/"  # address of your proxy endpoint
API_KEY = "sk-xxx"                        # replace with your API key
MODEL_NAME = "gpt-4o"
# =======================================

# Optional progress bar: wrap iterables with tqdm when it can be imported,
# otherwise hand the iterable back untouched.
try:
    from tqdm import tqdm
except Exception:
    def progress(iterable, total=None):
        """Return *iterable* as-is; used when tqdm could not be imported."""
        return iterable
else:
    def progress(iterable, total=None):
        """Wrap *iterable* in a tqdm progress bar of length *total*."""
        return tqdm(iterable, total=total)

# Module-level API client built from BASE_URL and API_KEY; evaluate() sends
# every grading request through it.
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)


def create_mathematica_evaluation_prompt(instruction: str, reference_code: str, generated_code: str) -> str:
    """Build the grading prompt sent to the judge model.

    The prompt contains the role, the two-dimension rubric (code similarity
    and instruction alignment, each scored 1..5), the three inputs and the
    required single-line JSON output schema.

    Args:
        instruction: Task description the code was generated from.
        reference_code: Reference Wolfram Language code.
        generated_code: Candidate Wolfram Language code to grade.

    Returns:
        The complete prompt text with the three inputs inserted verbatim.
    """
    # Doubled braces are literal braces after str.format(); only the three
    # named fields are substituted.
    template = """# ROLE
You are a senior Wolfram Language (Mathematica) developer and a strict JSON-only grader.

# TASK
Evaluate the [GENERATED CODE] against [REFERENCE CODE] and [INSTRUCTION] using the rubric below.
Then OUTPUT ONE SINGLE-LINE JSON OBJECT ONLY.

# RUBRIC (two dimensions, integer scores 1..5)
1) Code Similarity:
   - Compare implementation logic/structure, core function choices (e.g., Graphics/Plot/ListLinePlot/Manipulate/Animate/Module/With/Block),
     idiomatic WL usage (functional programming, pure functions, patterns, Options handling, scoping constructs),
     option management (PlotRange, Axes, Epilog, Styling), and robustness (determinism, reproducibility).
   - 5: Almost identical structure/approach/APIs; only superficial differences.
   - 4: Same core approach with minor alternative APIs or styles of similar quality/efficiency.
   - 3: Functionally correct but significantly different approach or older/less idiomatic APIs.
   - 2: Substantially different, partially fulfills; clumsy or inappropriate constructs.
   - 1: Unrelated/wrong constructs.

2) Instruction Alignment:
   - Does the resulting output (visual/interactive/sequence/values/layout/styling/timing) satisfy ALL requirements in [INSTRUCTION]?
   - For interactive tasks, check presence and correctness of Manipulate/Animate/Locator, dynamic bindings, control ranges; for plots/graphics, check primitives, styles, labels, ranges, ordering.
   - 5: Perfectly implements all details.
   - 4: All core requirements met; only minor deviations.
   - 3: Main goal achieved but misses several details or one key requirement.
   - 2: Only partial fulfillment with major omissions.
   - 1: Fails to follow instruction.

# INPUTS
[INSTRUCTION]
{instruction}

[REFERENCE CODE]
{reference_code}

[GENERATED CODE]
{generated_code}

# OUTPUT SCHEMA (MUST MATCH KEYS AND TYPES EXACTLY)
Return ONE minified JSON object with EXACTLY these keys:
{{"code_similarity": {{"score": <int 1-5>, "reasoning": "<<=60 words, no newline>"}}, "instruction_alignment": {{"score": <int 1-5>, "reasoning": "<<=60 words, no newline>"}}}}

# HARD CONSTRAINTS — READ CAREFULLY
- Output JSON ONLY. No markdown, no code fences, no prose, no prefix/suffix.
- Do NOT wrap with ``` or ```json.
- The FIRST character MUST be '{{' and the LAST character MUST be '}}'.
- Single line only (no newline characters). No trailing commas.
- Use integers 1..5 for "score".
"""
    return template.format(
        instruction=instruction,
        reference_code=reference_code,
        generated_code=generated_code,
    )


def force_parse_json(s: str):
    """Best-effort extraction of a JSON value from a noisy model reply.

    Steps, in order:
      1. Trim whitespace and peel a leading markdown fence (with an optional
         ``json`` language tag) plus a trailing fence. Only the fence markers
         at the two ends are removed, so triple backticks inside JSON string
         values are left intact.
      2. Parse the span from the first ``{`` to the last ``}`` (or the whole
         text when there is no such span).
      3. If that fails, scan left to right and return the first JSON object
         that decodes on its own, ignoring whatever text follows it.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict whenever step 3 is what succeeds).

    Raises:
        json.JSONDecodeError: If no step yields valid JSON; the error raised
            is the one produced by step 2.
    """
    text = s.strip()

    # Remove fence markers only at the edges; a global replace would also
    # delete backticks that legitimately appear inside string values.
    if text.startswith("```"):
        text = text[3:]
        if text.lower().startswith("json"):
            text = text[4:]
        text = text.strip()
        if text.endswith("```"):
            text = text[:-3].rstrip()

    # Primary attempt: outermost brace-delimited span.
    left = text.find("{")
    right = text.rfind("}")
    candidate = text[left:right + 1] if left != -1 and right > left else text

    try:
        return json.loads(candidate)
    except json.JSONDecodeError as exc:
        first_error = exc

    # Fallback: trailing prose containing braces defeats the span above, so
    # try decoding a standalone object starting at each "{" in turn.
    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos != -1:
        try:
            obj, _ = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            pos = text.find("{", pos + 1)
        else:
            return obj

    raise first_error


def load_jsonl(path):
    """Read a JSON Lines file into a list, one decoded value per non-blank line.

    The file is decoded as ``utf-8-sig`` so a leading UTF-8 byte-order mark is
    dropped instead of breaking the first record; files without a BOM decode
    exactly as plain UTF-8.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        list: Decoded records in file order; blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    with open(path, "r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, but tell the user where the
                # bad record lives.
                raise json.JSONDecodeError(
                    f"{path}, line {line_no}: {err.msg}", err.doc, err.pos
                ) from err
    return records


def evaluate():
    """Grade every generated sample against its reference with the judge model.

    Loads REFERENCE_FILE and GENERATED_FILE (JSONL, paired by line order),
    builds a grading prompt for each pair, sends it to MODEL_NAME through the
    module-level ``client`` and writes one JSON line per pair to OUTPUT_FILE.
    Reference records must provide "instruction" and "Manipulate"; generated
    records must provide "Manipulate". If the two files differ in length a
    warning is printed and only the common prefix is evaluated.

    Per-sample failures (missing keys, API errors, unparseable replies) are
    reported on stderr and stored under ``evaluation["error"]`` in the output
    instead of aborting the run. A summary with OK/ERR counts is printed at
    the end.
    """
    reference_data = load_jsonl(REFERENCE_FILE)
    generated_data = load_jsonl(GENERATED_FILE)

    if len(reference_data) != len(generated_data):
        print(
            f"⚠️  Warning: reference({len(reference_data)}) != generated({len(generated_data)}). "
            f"Will evaluate up to min length.",
            file=sys.stderr
        )

    n = min(len(reference_data), len(generated_data))
    ok_cnt, err_cnt = 0, 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
        # zip() stops at the shorter list, i.e. after exactly n pairs.
        pairs = zip(reference_data, generated_data)
        for idx, (ref, gen) in enumerate(progress(pairs, total=n)):
            try:
                instruction = ref["instruction"]
                reference_code = ref["Manipulate"]
                generated_code = gen["Manipulate"]
            except KeyError as ke:
                # Record the malformed pair and move on to the next one.
                print(f"❌ [idx={idx}] Missing key: {ke}", file=sys.stderr)
                out_f.write(json.dumps({
                    "index": idx,
                    "instruction": ref.get("instruction"),
                    "reference_code": ref.get("Manipulate"),
                    "generated_code": gen.get("Manipulate"),
                    "evaluation": {"error": f"missing key: {ke}"}
                }, ensure_ascii=False) + "\n")
                out_f.flush()
                err_cnt += 1
                continue

            prompt = create_mathematica_evaluation_prompt(
                instruction, reference_code, generated_code
            )

            # Reset per sample: if the request itself fails, the error record
            # must not carry the reply of an earlier sample as "raw_output".
            output_text = None
            try:
                resp = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[
                        {"role": "system",
                         "content": "You are a strict JSON formatter. Always reply with a single-line compact JSON object matching the requested schema. No markdown, no code fences, no explanations."},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0
                )
                output_text = resp.choices[0].message.content.strip()

                # Strict parse first; fall back to the lenient extractor.
                try:
                    eval_result = json.loads(output_text)
                except Exception:
                    eval_result = force_parse_json(output_text)

            except Exception as e:
                # Deliberately broad: one bad sample should not stop the batch.
                print(f"❌ [idx={idx}] API/parse error: {e}", file=sys.stderr)
                eval_result = {"error": str(e), "raw_output": output_text}
                err_cnt += 1
            else:
                ok_cnt += 1

            result_entry = {
                "index": idx,
                "instruction": instruction,
                "reference_code": reference_code,
                "generated_code": generated_code,
                "evaluation": eval_result
            }

            out_f.write(json.dumps(result_entry, ensure_ascii=False) + "\n")
            # Flush after each record so results written so far are not left
            # sitting in the buffer.
            out_f.flush()

    print(f"✅ Done. Saved to {OUTPUT_FILE}. OK={ok_cnt}, ERR={err_cnt}")


# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    evaluate()